1 Traumatic Brain injury

Use the kNN algorithm to provide a classification of the data in the TBI case study, (CaseStudy11_TBI). Determine an appropriate k, train and evaluate the performance of the classification model on the data. Report some model quality statistics for a couple of different values of k and use these to rank-order (and perhaps plot the classification results of) the models.

library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
tbi<-read_excel("I:/UBD PB/ZH5102 ML/ass4/CaseStudy11_TBI.xlsx")
str(tbi)
## tibble [46 × 19] (S3: tbl_df/tbl/data.frame)
##  $ id         : num [1:46] 1 2 3 4 5 6 7 8 9 10 ...
##  $ age        : num [1:46] 19 55 24 57 54 16 21 25 30 38 ...
##  $ sex        : chr [1:46] "Male" "Male" "Male" "Female" ...
##  $ mechanism  : chr [1:46] "Fall" "Blunt" "Fall" "Fall" ...
##  $ field.gcs  : chr [1:46] "10" "." "12" "4" ...
##  $ er.gcs     : chr [1:46] "10" "3" "12" "4" ...
##  $ icu.gcs    : chr [1:46] "10" "3" "8" "6" ...
##  $ worst.gcs  : chr [1:46] "10" "3" "8" "4" ...
##  $ 6m.gose    : chr [1:46] "5" "5" "7" "3" ...
##  $ 2013.gose  : num [1:46] 5 7 7 3 7 8 3 3 5 3 ...
##  $ skull.fx   : num [1:46] 0 1 1 1 0 1 1 0 1 1 ...
##  $ temp.injury: num [1:46] 1 1 0 1 1 1 0 1 1 1 ...
##  $ surgery    : num [1:46] 1 1 0 1 1 1 1 0 1 1 ...
##  $ spikes.hr  : chr [1:46] "." "168.74" "37.369999999999997" "4.3499999999999996" ...
##  $ min.hr     : chr [1:46] "." "14" "0" "0" ...
##  $ max.hr     : chr [1:46] "." "757" "351" "59" ...
##  $ acute.sz   : num [1:46] 1 0 0 0 0 0 0 0 0 0 ...
##  $ late.sz    : num [1:46] 1 1 0 0 0 1 0 1 1 1 ...
##  $ ever.sz    : num [1:46] 1 1 0 0 0 1 0 1 1 1 ...
tbi<-tbi[,-1]
tbi[tbi=="."] <- NA 
tbi<- as.data.frame(tbi)

tbi$sex <- as.factor(tbi$sex)
tbi$mechanism <- as.factor(tbi$mechanism)

library(mi)
## Warning: package 'mi' was built under R version 4.2.3
## Loading required package: Matrix
## Loading required package: stats4
## mi (Version 1.1, packaged: 2022-06-05 05:31:15 UTC; ben)
## mi  Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
## This program comes with ABSOLUTELY NO WARRANTY.
## This is free software, and you are welcome to redistribute it
## under the General Public License version 2 or later.
## Execute RShowDoc('COPYING') for details.
tbic<-complete.cases(tbi)
prop.table(table(tbic))
## tbic
## FALSE  TRUE 
##   0.5   0.5
mdf<-missing_data.frame(tbi)
## NOTE: The following pairs of variables appear to have the same missingness pattern.
##  Please verify whether they are in fact logically distinct variables.
##      [,1]      [,2]       
## [1,] "icu.gcs" "worst.gcs"
head(mdf)
##   age    sex    mechanism field.gcs er.gcs icu.gcs worst.gcs X6m.gose
## 1  19   Male         Fall        10     10      10        10        5
## 2  55   Male        Blunt      <NA>      3       3         3        5
## 3  24   Male         Fall        12     12       8         8        7
## 4  57 Female         Fall         4      4       6         4        3
## 5  54 Female Peds_vs_Auto        14     11       8         8        5
## 6  16 Female          MVA        13      7       7         7        7
##   X2013.gose skull.fx temp.injury surgery          spikes.hr min.hr max.hr
## 1          5        0           1       1               <NA>   <NA>   <NA>
## 2          7        1           1       1             168.74     14    757
## 3          7        1           0       0 37.369999999999997      0    351
## 4          3        1           1       1 4.3499999999999996      0     59
## 5          7        0           1       1              54.59      0    284
## 6          8        1           1       1              75.92      7    180
##   acute.sz late.sz ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 1        1       1       1             FALSE          FALSE           FALSE
## 2        0       1       1              TRUE          FALSE           FALSE
## 3        0       0       0             FALSE          FALSE           FALSE
## 4        0       0       0             FALSE          FALSE           FALSE
## 5        0       0       0             FALSE          FALSE           FALSE
## 6        0       1       1             FALSE          FALSE           FALSE
##   missing_worst.gcs missing_6m.gose missing_spikes.hr missing_min.hr
## 1             FALSE           FALSE              TRUE           TRUE
## 2             FALSE           FALSE             FALSE          FALSE
## 3             FALSE           FALSE             FALSE          FALSE
## 4             FALSE           FALSE             FALSE          FALSE
## 5             FALSE           FALSE             FALSE          FALSE
## 6             FALSE           FALSE             FALSE          FALSE
##   missing_max.hr
## 1           TRUE
## 2          FALSE
## 3          FALSE
## 4          FALSE
## 5          FALSE
## 6          FALSE
show(mdf)
## Object of class missing_data.frame with 46 observations on 18 variables
## 
## There are 7 missing data patterns
## 
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
## 
##                              type missing method  model
## age                    continuous       0   <NA>   <NA>
## sex                        binary       0   <NA>   <NA>
## mechanism   unordered-categorical       0   <NA>   <NA>
## field.gcs   unordered-categorical       2    ppd mlogit
## er.gcs      unordered-categorical       2    ppd mlogit
## icu.gcs     unordered-categorical       1    ppd mlogit
## worst.gcs   unordered-categorical       1    ppd mlogit
## 6m.gose     unordered-categorical       5    ppd mlogit
## 2013.gose              continuous       0   <NA>   <NA>
## skull.fx                   binary       0   <NA>   <NA>
## temp.injury                binary       0   <NA>   <NA>
## surgery                    binary       0   <NA>   <NA>
## spikes.hr   unordered-categorical      18    ppd mlogit
## min.hr      unordered-categorical      18    ppd mlogit
## max.hr      unordered-categorical      18    ppd mlogit
## acute.sz                   binary       0   <NA>   <NA>
## late.sz                    binary       0   <NA>   <NA>
## ever.sz                    binary       0   <NA>   <NA>
## 
##                  family  link transformation
## age                <NA>  <NA>    standardize
## sex                <NA>  <NA>           <NA>
## mechanism          <NA>  <NA>           <NA>
## field.gcs   multinomial logit           <NA>
## er.gcs      multinomial logit           <NA>
## icu.gcs     multinomial logit           <NA>
## worst.gcs   multinomial logit           <NA>
## 6m.gose     multinomial logit           <NA>
## 2013.gose          <NA>  <NA>    standardize
## skull.fx           <NA>  <NA>           <NA>
## temp.injury        <NA>  <NA>           <NA>
## surgery            <NA>  <NA>           <NA>
## spikes.hr   multinomial logit           <NA>
## min.hr      multinomial logit           <NA>
## max.hr      multinomial logit           <NA>
## acute.sz           <NA>  <NA>           <NA>
## late.sz            <NA>  <NA>           <NA>
## ever.sz            <NA>  <NA>           <NA>
mdf<-change(mdf, y="field.gcs" , what = "imputation_method", to="pmm")
mdf<-change(mdf, y="er.gcs" , what = "imputation_method", to="pmm")
mdf<-change(mdf, y="icu.gcs" , what = "imputation_method", to="pmm")
mdf<-change(mdf, y="worst.gcs" , what = "imputation_method", to="pmm")
mdf<-change(mdf, y="6m.gose" , what = "imputation_method", to="pmm")
mdf<-change(mdf, y="min.hr" , what = "imputation_method", to="pmm")
mdf<-change(mdf, y="max.hr" , what = "imputation_method", to="pmm")

imputations<-mi(mdf, n.iter=1, n.chains=1, verbose=T)
tbim<- complete(imputations, 3)
tbim<-tbim[[1]]
tbim<-tbim[,-19:-26]
str(tbim)
## 'data.frame':    46 obs. of  18 variables:
##  $ age        : num  19 55 24 57 54 16 21 25 30 38 ...
##  $ sex        : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 1 2 2 2 2 ...
##  $ mechanism  : Factor w/ 7 levels "Bike_vs_Auto",..: 3 2 3 3 7 6 3 3 4 3 ...
##  $ field.gcs  : Factor w/ 12 levels "10","12","13",..: 1 6 2 7 4 3 6 6 6 6 ...
##  $ er.gcs     : Factor w/ 13 levels "10","11","12",..: 1 7 3 8 2 11 7 8 13 10 ...
##  $ icu.gcs    : Factor w/ 11 levels "0","10","11",..: 2 6 10 8 10 9 8 6 6 8 ...
##  $ worst.gcs  : Factor w/ 13 levels "0","10","11",..: 2 7 12 8 12 11 7 7 7 7 ...
##  $ X6m.gose   : Factor w/ 7 levels "2","3","4","5",..: 4 4 6 2 4 6 2 2 2 2 ...
##  $ X2013.gose : num  5 7 7 3 7 8 3 3 5 3 ...
##  $ skull.fx   : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 2 1 2 2 ...
##  $ temp.injury: Factor w/ 2 levels "0","1": 2 2 1 2 2 2 1 2 2 2 ...
##  $ surgery    : Factor w/ 2 levels "0","1": 2 2 1 2 2 2 2 1 2 2 ...
##  $ spikes.hr  : Factor w/ 28 levels "1.28","1.66",..: 21 4 12 13 20 26 17 17 14 15 ...
##  $ min.hr     : Factor w/ 7 levels "0","14","3","30",..: 1 2 1 1 1 7 1 1 1 5 ...
##  $ max.hr     : Factor w/ 28 levels "107","1199","12",..: 24 25 16 21 12 7 4 28 18 1 ...
##  $ acute.sz   : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
##  $ late.sz    : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 2 2 2 ...
##  $ ever.sz    : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 2 2 2 ...
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
tbim<-tbim%>%mutate_at(c(2:8,10:18),as.integer)

Y <- ifelse (tbim$surgery == 1, "No","Yes")
X <- tbim[, -12]

t_train <- X[1:23, ]
t_test  <- X[24:46, ]
t_train_labels <- Y[1:23]  
t_test_labels  <- Y[24:46]
table(t_train_labels)
## t_train_labels
##  No Yes 
##   7  16
round(prop.table(table(t_train_labels)), digits=2)
## t_train_labels
##  No Yes 
## 0.3 0.7
round(prop.table(table(t_test_labels)), digits=2)
## t_test_labels
##   No  Yes 
## 0.43 0.57
library(class)
t_test_pred <- knn(train=t_train, test=t_test, cl=t_train_labels, k=3)
library(gmodels)
CT1<-CrossTable(x=t_test_labels, y=t_test_pred, prop.chisq = F);CT1
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  23 
## 
##  
##               | t_test_pred 
## t_test_labels |        No |       Yes | Row Total | 
## --------------|-----------|-----------|-----------|
##            No |         3 |         7 |        10 | 
##               |     0.300 |     0.700 |     0.435 | 
##               |     0.333 |     0.500 |           | 
##               |     0.130 |     0.304 |           | 
## --------------|-----------|-----------|-----------|
##           Yes |         6 |         7 |        13 | 
##               |     0.462 |     0.538 |     0.565 | 
##               |     0.667 |     0.500 |           | 
##               |     0.261 |     0.304 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |         9 |        14 |        23 | 
##               |     0.391 |     0.609 |           | 
## --------------|-----------|-----------|-----------|
## 
## 
## $t
##      y
## x     No Yes
##   No   3   7
##   Yes  6   7
## 
## $prop.row
##      y
## x            No       Yes
##   No  0.3000000 0.7000000
##   Yes 0.4615385 0.5384615
## 
## $prop.col
##      y
## x            No       Yes
##   No  0.3333333 0.5000000
##   Yes 0.6666667 0.5000000
## 
## $prop.tbl
##      y
## x            No       Yes
##   No  0.1304348 0.3043478
##   Yes 0.2608696 0.3043478
print(paste0("Prediction accuracy of model 't_test_pred' (k=3) is ", (CT1$prop.tbl[1,1]+CT1$prop.tbl[2,2]) ))
## [1] "Prediction accuracy of model 't_test_pred' (k=3) is 0.434782608695652"
library(e1071)
knntuning = tune.knn(x= t_train, y = as.factor(t_train_labels), k = 1:20)
knntuning
## 
## Parameter tuning of 'knn.wrapper':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##   k
##  15
## 
## - best performance: 0.3166667
summary(knntuning) # k=8
## 
## Parameter tuning of 'knn.wrapper':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##   k
##  15
## 
## - best performance: 0.3166667 
## 
## - Detailed performance results:
##     k     error dispersion
## 1   1 0.4833333  0.3464992
## 2   2 0.4666667  0.3406602
## 3   3 0.6333333  0.3583226
## 4   4 0.7333333  0.2960647
## 5   5 0.5000000  0.1924501
## 6   6 0.5500000  0.2490724
## 7   7 0.3666667  0.3122993
## 8   8 0.4666667  0.3406602
## 9   9 0.5000000  0.3042903
## 10 10 0.3666667  0.3122993
## 11 11 0.3666667  0.3122993
## 12 12 0.4666667  0.3406602
## 13 13 0.4000000  0.2854496
## 14 14 0.3666667  0.3122993
## 15 15 0.3166667  0.3282012
## 16 16 0.3166667  0.3282012
## 17 17 0.3166667  0.3282012
## 18 18 0.3166667  0.3282012
## 19 19 0.3166667  0.3282012
## 20 20 0.3166667  0.3282012
t_test_pred <- knn(train=t_train, test=t_test,  cl=t_train_labels, prob=T, k=2)
t_test_predBin <- ifelse(attributes(t_test_pred)$prob >0.6, "No", "Yes")
CT2 <- CrossTable(x=t_test_labels, y=t_test_predBin, prop.chisq = F); CT2
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  23 
## 
##  
##               | t_test_predBin 
## t_test_labels |        No |       Yes | Row Total | 
## --------------|-----------|-----------|-----------|
##            No |         3 |         7 |        10 | 
##               |     0.300 |     0.700 |     0.435 | 
##               |     0.333 |     0.500 |           | 
##               |     0.130 |     0.304 |           | 
## --------------|-----------|-----------|-----------|
##           Yes |         6 |         7 |        13 | 
##               |     0.462 |     0.538 |     0.565 | 
##               |     0.667 |     0.500 |           | 
##               |     0.261 |     0.304 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |         9 |        14 |        23 | 
##               |     0.391 |     0.609 |           | 
## --------------|-----------|-----------|-----------|
## 
## 
## $t
##      y
## x     No Yes
##   No   3   7
##   Yes  6   7
## 
## $prop.row
##      y
## x            No       Yes
##   No  0.3000000 0.7000000
##   Yes 0.4615385 0.5384615
## 
## $prop.col
##      y
## x            No       Yes
##   No  0.3333333 0.5000000
##   Yes 0.6666667 0.5000000
## 
## $prop.tbl
##      y
## x            No       Yes
##   No  0.1304348 0.3043478
##   Yes 0.2608696 0.3043478
print(paste0("Prediction accuracy of model 't_test_pred' (k=2) is ", (CT2$prop.tbl[1,1]+CT2$prop.tbl[2,2]) ))
## [1] "Prediction accuracy of model 't_test_pred' (k=2) is 0.434782608695652"
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
knnControl <- trainControl(
    method = "cv", ## cross validation
    number = 10,   ## 10-fold
    summaryFunction = twoClassSummary,
    classProbs = TRUE,
    verboseIter = FALSE
)

knn_model <- train(x=t_train, y=t_train_labels , metric = "ROC", method = "knn", tuneLength = 20, trControl = knnControl)
print(knn_model) # k = 13
## k-Nearest Neighbors 
## 
## 23 samples
## 17 predictors
##  2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 21, 22, 22, 22, 20, 20, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens       Spec
##    5  0.3928571  0.1428571  0.75
##    7  0.4285714  0.1428571  0.85
##    9  0.5000000  0.0000000  0.95
##   11  0.3928571  0.0000000  1.00
##   13  0.4642857  0.0000000  1.00
##   15  0.4285714  0.0000000  1.00
##   17  0.4285714  0.0000000  1.00
##   19  0.3571429  0.0000000  1.00
##   21  0.5000000  0.0000000  1.00
##   23  0.5000000  0.0000000  1.00
##   25  0.5000000  0.0000000  1.00
##   27  0.5000000  0.0000000  1.00
##   29  0.5000000  0.0000000  1.00
##   31  0.5000000  0.0000000  1.00
##   33  0.5000000  0.0000000  1.00
##   35  0.5000000  0.0000000  1.00
##   37  0.5000000  0.0000000  1.00
##   39  0.5000000  0.0000000  1.00
##   41  0.5000000  0.0000000  1.00
##   43  0.5000000  0.0000000  1.00
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 43.
t_test_pred <- knn(train=t_train, test=t_test,  cl=t_train_labels, prob=T, k=9)
t_test_predBin <- ifelse(attributes(t_test_pred)$prob >0.6, "No", "Yes")
CT3 <- CrossTable(x=t_test_labels, y=t_test_predBin, prop.chisq = F); CT3
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  23 
## 
##  
##               | t_test_predBin 
## t_test_labels |        No |       Yes | Row Total | 
## --------------|-----------|-----------|-----------|
##            No |         7 |         3 |        10 | 
##               |     0.700 |     0.300 |     0.435 | 
##               |     0.500 |     0.333 |           | 
##               |     0.304 |     0.130 |           | 
## --------------|-----------|-----------|-----------|
##           Yes |         7 |         6 |        13 | 
##               |     0.538 |     0.462 |     0.565 | 
##               |     0.500 |     0.667 |           | 
##               |     0.304 |     0.261 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |        14 |         9 |        23 | 
##               |     0.609 |     0.391 |           | 
## --------------|-----------|-----------|-----------|
## 
## 
## $t
##      y
## x     No Yes
##   No   7   3
##   Yes  7   6
## 
## $prop.row
##      y
## x            No       Yes
##   No  0.7000000 0.3000000
##   Yes 0.5384615 0.4615385
## 
## $prop.col
##      y
## x            No       Yes
##   No  0.5000000 0.3333333
##   Yes 0.5000000 0.6666667
## 
## $prop.tbl
##      y
## x            No       Yes
##   No  0.3043478 0.1304348
##   Yes 0.3043478 0.2608696
print(paste0("Prediction accuracy of model 't_test_pred' (k=9) is ", (CT3$prop.tbl[1,1]+CT2$prop.tbl[2,2]) ))
## [1] "Prediction accuracy of model 't_test_pred' (k=9) is 0.608695652173913"
library(dplyr)
summaryPredictions <- predict(knn_model, newdata = t_test, type = "prob")
summaryPredictionsLabel <- ifelse (summaryPredictions$No > 0.6, "No", "Yes")
testDataPredSummary <- as.data.frame(cbind(trueLabels=t_test_labels, YesProb=summaryPredictions$Yes,NoProb=summaryPredictions$No, knnPredLabel=summaryPredictionsLabel))
print(paste0("Accuracy = ", 2*as.numeric(table(testDataPredSummary$trueLabels == testDataPredSummary $knnPredLabel)[2]), "%"))
## [1] "Accuracy = 26%"
mod2_TN <- CT2$prop.row[1, 1]  
mod2_FP <- CT2$prop.row[1, 2]
mod2_FN <- CT2$prop.row[2, 1]
mod2_TP <- CT2$prop.row[2, 2]

mod2_sensi <- mod2_TN/(mod2_TN+mod2_FP) 
mod2_speci <- mod2_TP/(mod2_TP+mod2_FN)
print(paste0("kNN model k=2 Sensitivity=", mod2_sensi))
## [1] "kNN model k=2 Sensitivity=0.3"
print(paste0("kNN model k=2 Specificity=", mod2_speci))
## [1] "kNN model k=2 Specificity=0.538461538461538"
confusionMatrix(as.factor(t_test_labels), as.factor(t_test_predBin))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No   7   3
##        Yes  7   6
##                                           
##                Accuracy : 0.5652          
##                  95% CI : (0.3449, 0.7681)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.7418          
##                                           
##                   Kappa : 0.1544          
##                                           
##  Mcnemar's Test P-Value : 0.3428          
##                                           
##             Sensitivity : 0.5000          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.7000          
##          Neg Pred Value : 0.4615          
##              Prevalence : 0.6087          
##          Detection Rate : 0.3043          
##    Detection Prevalence : 0.4348          
##       Balanced Accuracy : 0.5833          
##                                           
##        'Positive' Class : No              
## 
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(x = c("TN", "FN", "FP", "TP"),
  y = c(mod2_TN, mod2_FN, mod2_FP, mod2_TP),
  name = c("TN", "FN", "FP", "TP"), type = "bar", color=c("TN", "FN", "FP", "TP")) %>% 
  layout(title="Confusion Matrix", 
           legend=list(title=list(text='<b> Model k=2; Performance Metrics </b>')), 
           xaxis=list(title='Metrics'), yaxis=list(title='Probability'))

2 Parkinson’s Disease

Use 05_PPMI_top_UPDRS_Integrated_LongFormat1 data to practice KNN classification.

ppmi<-read.csv("I:/UBD PB/ZH5102 ML/ass4/05_PPMI_top_UPDRS_Integrated_LongFormat.csv")

3 High Dimensional Space KNN Classification

4 Lower Dimension Space KNN Classification

Try the above protocol again but select only columns 1 to 5 as predictors (after deleting the index and ID columns). Now, what about the \(k\) you select and the error rates for each kind of scaled data (original data, normalized data)? Comment on any interesting observations.

ppmi<-read.csv("I:/UBD PB/ZH5102 ML/ass4/05_PPMI_top_UPDRS_Integrated_LongFormat.csv")
ppmi$pd<-ifelse(ppmi$ResearchGroup=="PD", 1, 0)
ppmi<-ppmi[,c(3:7,72)]
cor(ppmi[,-6])
##                               L_insular_cortex_ComputeArea
## L_insular_cortex_ComputeArea                     1.0000000
## L_insular_cortex_Volume                          0.9869107
## R_insular_cortex_ComputeArea                     0.9342669
## R_insular_cortex_Volume                          0.9223210
## L_cingulate_gyrus_ComputeArea                    0.9066181
##                               L_insular_cortex_Volume
## L_insular_cortex_ComputeArea                0.9869107
## L_insular_cortex_Volume                     1.0000000
## R_insular_cortex_ComputeArea                0.9312231
## R_insular_cortex_Volume                     0.9340757
## L_cingulate_gyrus_ComputeArea               0.9064426
##                               R_insular_cortex_ComputeArea
## L_insular_cortex_ComputeArea                     0.9342669
## L_insular_cortex_Volume                          0.9312231
## R_insular_cortex_ComputeArea                     1.0000000
## R_insular_cortex_Volume                          0.9864854
## L_cingulate_gyrus_ComputeArea                    0.9198314
##                               R_insular_cortex_Volume
## L_insular_cortex_ComputeArea                0.9223210
## L_insular_cortex_Volume                     0.9340757
## R_insular_cortex_ComputeArea                0.9864854
## R_insular_cortex_Volume                     1.0000000
## L_cingulate_gyrus_ComputeArea               0.9128562
##                               L_cingulate_gyrus_ComputeArea
## L_insular_cortex_ComputeArea                      0.9066181
## L_insular_cortex_Volume                           0.9064426
## R_insular_cortex_ComputeArea                      0.9198314
## R_insular_cortex_Volume                           0.9128562
## L_cingulate_gyrus_ComputeArea                     1.0000000
ggpairs(ppmi, columns = 1:5, ggplot2::aes(colour=as.factor(pd)))

ppmi$pd<-as.integer(ppmi$pd)
pz<- as.data.frame(lapply(ppmi, scale))
set.seed(2023)
sub<-sample(nrow(pz), floor(nrow(pz)*2/3)) 
pz_train<-pz[sub, ]  
pz_test<-pz[-sub, ]

Y<-ifelse (pz$pd < 0.5, "No", "Yes")
pz_train_label<-Y[sub]  
pz_test_label<-Y[-sub]  
table(pz_train_label)
## pz_train_label
##  No Yes 
## 148 292
knntuning = tune.knn(x= pz_train, y = as.factor(pz_train_label), k = 1:50)
summary(knntuning) # k=1 not applicable
## 
## Parameter tuning of 'knn.wrapper':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  k
##  1
## 
## - best performance: 0 
## 
## - Detailed performance results:
##     k error dispersion
## 1   1     0          0
## 2   2     0          0
## 3   3     0          0
## 4   4     0          0
## 5   5     0          0
## 6   6     0          0
## 7   7     0          0
## 8   8     0          0
## 9   9     0          0
## 10 10     0          0
## 11 11     0          0
## 12 12     0          0
## 13 13     0          0
## 14 14     0          0
## 15 15     0          0
## 16 16     0          0
## 17 17     0          0
## 18 18     0          0
## 19 19     0          0
## 20 20     0          0
## 21 21     0          0
## 22 22     0          0
## 23 23     0          0
## 24 24     0          0
## 25 25     0          0
## 26 26     0          0
## 27 27     0          0
## 28 28     0          0
## 29 29     0          0
## 30 30     0          0
## 31 31     0          0
## 32 32     0          0
## 33 33     0          0
## 34 34     0          0
## 35 35     0          0
## 36 36     0          0
## 37 37     0          0
## 38 38     0          0
## 39 39     0          0
## 40 40     0          0
## 41 41     0          0
## 42 42     0          0
## 43 43     0          0
## 44 44     0          0
## 45 45     0          0
## 46 46     0          0
## 47 47     0          0
## 48 48     0          0
## 49 49     0          0
## 50 50     0          0
knn_model <- train(x=pz_train, y=pz_train_label , metric = "ROC", method = "knn", tuneLength = 20, trControl = knnControl)
print(knn_model) # k = 43
## k-Nearest Neighbors 
## 
## 440 samples
##   6 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 397, 396, 396, 395, 396, 396, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC  Sens  Spec
##    5  1    1     1   
##    7  1    1     1   
##    9  1    1     1   
##   11  1    1     1   
##   13  1    1     1   
##   15  1    1     1   
##   17  1    1     1   
##   19  1    1     1   
##   21  1    1     1   
##   23  1    1     1   
##   25  1    1     1   
##   27  1    1     1   
##   29  1    1     1   
##   31  1    1     1   
##   33  1    1     1   
##   35  1    1     1   
##   37  1    1     1   
##   39  1    1     1   
##   41  1    1     1   
##   43  1    1     1   
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 43.
pz_test_pred43 <- knn(train=pz_train, test=pz_test,  cl=pz_train_label, prob=T, k=43)
pz_test_predBin43 <- ifelse(attributes(pz_test_pred43)$prob == 1, "No", "Yes")
CT <- CrossTable(x=pz_test_label, y=pz_test_predBin43, prop.chisq = F)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  221 
## 
##  
##               | pz_test_predBin43 
## pz_test_label |        No |       Yes | Row Total | 
## --------------|-----------|-----------|-----------|
##            No |        76 |         3 |        79 | 
##               |     0.962 |     0.038 |     0.357 | 
##               |     0.349 |     1.000 |           | 
##               |     0.344 |     0.014 |           | 
## --------------|-----------|-----------|-----------|
##           Yes |       142 |         0 |       142 | 
##               |     1.000 |     0.000 |     0.643 | 
##               |     0.651 |     0.000 |           | 
##               |     0.643 |     0.000 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |       218 |         3 |       221 | 
##               |     0.986 |     0.014 |           | 
## --------------|-----------|-----------|-----------|
## 
## 
print(paste0("Prediction accuracy of model 'pz_test_pred' (k=43) is ", (CT$prop.tbl[1,1]+CT1$prop.tbl[2,2]) ))
## [1] "Prediction accuracy of model 'pz_test_pred' (k=43) is 0.648239228801889"
mod43_TN <- CT$prop.row[1, 1]  
mod43_FP <- CT$prop.row[1, 2]
mod43_FN <- CT$prop.row[2, 1]
mod43_TP <- CT$prop.row[2, 2]

mod43_sensi <- mod43_TN/(mod43_TN+mod43_FP) 
mod43_speci <- mod43_TP/(mod43_TP+mod43_FN)

print(paste0("kNN model k=43 Sensitivity=", mod43_sensi))
## [1] "kNN model k=43 Sensitivity=0.962025316455696"
print(paste0("kNN model k=43 Specificity=", mod43_speci))
## [1] "kNN model k=43 Specificity=0"
gmm_clust <- Mclust(pz)
summary(gmm_clust, parameters = TRUE)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEV (ellipsoidal, equal shape) model with 2 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -371.4151 661 50 -1067.518 -1073.895
## 
## Clustering table:
##   1   2 
## 308 353 
## 
## Mixing probabilities:
##         1         2 
## 0.4670231 0.5329769 
## 
## Means:
##                                    [,1]       [,2]
## L_insular_cortex_ComputeArea  0.6220802 -0.5451002
## L_insular_cortex_Volume       0.6247205 -0.5474138
## R_insular_cortex_ComputeArea  0.6565830 -0.5753334
## R_insular_cortex_Volume       0.6503967 -0.5699127
## L_cingulate_gyrus_ComputeArea 0.6063131 -0.5312842
## pd                            0.7226688 -0.6332413
## 
## Variances:
## [,,1]
##                               L_insular_cortex_ComputeArea
## L_insular_cortex_ComputeArea                     0.1171544
## L_insular_cortex_Volume                          0.1388148
## R_insular_cortex_ComputeArea                     0.1065900
## R_insular_cortex_Volume                          0.1307602
## L_cingulate_gyrus_ComputeArea                    0.1251054
## pd                                               0.0000000
##                               L_insular_cortex_Volume
## L_insular_cortex_ComputeArea                0.1388148
## L_insular_cortex_Volume                     0.1751961
## R_insular_cortex_ComputeArea                0.1268470
## R_insular_cortex_Volume                     0.1637437
## L_cingulate_gyrus_ComputeArea               0.1487934
## pd                                          0.0000000
##                               R_insular_cortex_ComputeArea
## L_insular_cortex_ComputeArea                     0.1065900
## L_insular_cortex_Volume                          0.1268470
## R_insular_cortex_ComputeArea                     0.1365914
## R_insular_cortex_Volume                          0.1632645
## L_cingulate_gyrus_ComputeArea                    0.1179645
## pd                                               0.0000000
##                               R_insular_cortex_Volume
## L_insular_cortex_ComputeArea                0.1307602
## L_insular_cortex_Volume                     0.1637437
## R_insular_cortex_ComputeArea                0.1632645
## R_insular_cortex_Volume                     0.2169759
## L_cingulate_gyrus_ComputeArea               0.1459372
## pd                                          0.0000000
##                               L_cingulate_gyrus_ComputeArea           pd
## L_insular_cortex_ComputeArea                      0.1251054 0.0000000000
## L_insular_cortex_Volume                           0.1487934 0.0000000000
## R_insular_cortex_ComputeArea                      0.1179645 0.0000000000
## R_insular_cortex_Volume                           0.1459372 0.0000000000
## L_cingulate_gyrus_ComputeArea                     0.2296633 0.0000000000
## pd                                                0.0000000 0.0003690069
## [,,2]
##                               L_insular_cortex_ComputeArea
## L_insular_cortex_ComputeArea                      1.490494
## L_insular_cortex_Volume                           1.420801
## R_insular_cortex_ComputeArea                      1.231770
## R_insular_cortex_Volume                           1.176463
## L_cingulate_gyrus_ComputeArea                     1.203296
## pd                                               -0.698225
##                               L_insular_cortex_Volume
## L_insular_cortex_ComputeArea                1.4208013
## L_insular_cortex_Volume                     1.3863118
## R_insular_cortex_ComputeArea                1.1930998
## R_insular_cortex_Volume                     1.1583531
## L_cingulate_gyrus_ComputeArea               1.1702884
## pd                                         -0.6945259
##                               R_insular_cortex_ComputeArea
## L_insular_cortex_ComputeArea                     1.2317695
## L_insular_cortex_Volume                          1.1930998
## R_insular_cortex_ComputeArea                     1.3405068
## R_insular_cortex_Volume                          1.2765749
## L_cingulate_gyrus_ComputeArea                    1.2586407
## pd                                              -0.6794121
##                               R_insular_cortex_Volume
## L_insular_cortex_ComputeArea                1.1764634
## L_insular_cortex_Volume                     1.1583531
## R_insular_cortex_ComputeArea                1.2765749
## R_insular_cortex_Volume                     1.2366622
## L_cingulate_gyrus_ComputeArea               1.2064917
## pd                                         -0.6571546
##                               L_cingulate_gyrus_ComputeArea         pd
## L_insular_cortex_ComputeArea                      1.2032960 -0.6982250
## L_insular_cortex_Volume                           1.1702884 -0.6945259
## R_insular_cortex_ComputeArea                      1.2586407 -0.6794121
## R_insular_cortex_Volume                           1.2064917 -0.6571546
## L_cingulate_gyrus_ComputeArea                     1.3235443 -0.6566522
## pd                                               -0.6566522  1.2582225
## Categorized into two clusters

knn.tune_train = tune.knn(x= pz_train, y = as.factor(pz_train_label), k = 1:50,
                    tunecontrol=tune.control(sampling = "fix") , fix=10)

summary(knn.tune_train)
## 
## Parameter tuning of 'knn.wrapper':
## 
## - sampling method: fixed training/validation set 
## 
## - best parameters:
##  k
##  1
## 
## - best performance: 0 
## 
## - Detailed performance results:
##     k       error dispersion
## 1   1 0.000000000         NA
## 2   2 0.000000000         NA
## 3   3 0.000000000         NA
## 4   4 0.000000000         NA
## 5   5 0.000000000         NA
## 6   6 0.000000000         NA
## 7   7 0.000000000         NA
## 8   8 0.000000000         NA
## 9   9 0.000000000         NA
## 10 10 0.000000000         NA
## 11 11 0.000000000         NA
## 12 12 0.000000000         NA
## 13 13 0.000000000         NA
## 14 14 0.000000000         NA
## 15 15 0.000000000         NA
## 16 16 0.000000000         NA
## 17 17 0.000000000         NA
## 18 18 0.000000000         NA
## 19 19 0.000000000         NA
## 20 20 0.000000000         NA
## 21 21 0.000000000         NA
## 22 22 0.000000000         NA
## 23 23 0.000000000         NA
## 24 24 0.000000000         NA
## 25 25 0.000000000         NA
## 26 26 0.000000000         NA
## 27 27 0.000000000         NA
## 28 28 0.000000000         NA
## 29 29 0.000000000         NA
## 30 30 0.000000000         NA
## 31 31 0.000000000         NA
## 32 32 0.000000000         NA
## 33 33 0.000000000         NA
## 34 34 0.000000000         NA
## 35 35 0.000000000         NA
## 36 36 0.000000000         NA
## 37 37 0.000000000         NA
## 38 38 0.000000000         NA
## 39 39 0.000000000         NA
## 40 40 0.000000000         NA
## 41 41 0.000000000         NA
## 42 42 0.000000000         NA
## 43 43 0.000000000         NA
## 44 44 0.000000000         NA
## 45 45 0.000000000         NA
## 46 46 0.000000000         NA
## 47 47 0.000000000         NA
## 48 48 0.000000000         NA
## 49 49 0.000000000         NA
## 50 50 0.006802721         NA
df <- as.data.frame(cbind(x=knn.tune_train$performance$k, y=knn.tune_train$performance$error))
plot_ly(df, x = ~x, y = ~y, type = "scatter", mode = "markers+lines") %>% 
  add_segments(x=1, xend=1, y=0.0, yend=0.45, type = "scatter", 
               line=list(color="gray", width = 2, dash = 'dot'), mode = "lines", showlegend=FALSE) %>% 
  add_segments(x=5, xend=5, y=0.0, yend=0.45, type = "scatter", 
               line=list(color="lightgray", width = 2, dash = 'dot'), mode = "lines", showlegend=FALSE) %>% 
  layout(title='PPMI k-NN Prediction (Train) - Error Rate against k', 
           xaxis=list(title='Number of nearest neighbors (k)'), yaxis=list(title='Classification error'))
knn.tune_test = tune.knn(x= pz_test, y = as.factor(pz_test_label), k = 1:50,
                    tunecontrol=tune.control(sampling = "fix") , fix=10)
summary(knn.tune_test)
## 
## Parameter tuning of 'knn.wrapper':
## 
## - sampling method: fixed training/validation set 
## 
## - best parameters:
##  k
##  1
## 
## - best performance: 0 
## 
## - Detailed performance results:
##     k      error dispersion
## 1   1 0.00000000         NA
## 2   2 0.00000000         NA
## 3   3 0.00000000         NA
## 4   4 0.00000000         NA
## 5   5 0.00000000         NA
## 6   6 0.00000000         NA
## 7   7 0.00000000         NA
## 8   8 0.00000000         NA
## 9   9 0.00000000         NA
## 10 10 0.00000000         NA
## 11 11 0.00000000         NA
## 12 12 0.00000000         NA
## 13 13 0.00000000         NA
## 14 14 0.00000000         NA
## 15 15 0.00000000         NA
## 16 16 0.00000000         NA
## 17 17 0.00000000         NA
## 18 18 0.00000000         NA
## 19 19 0.00000000         NA
## 20 20 0.00000000         NA
## 21 21 0.00000000         NA
## 22 22 0.00000000         NA
## 23 23 0.00000000         NA
## 24 24 0.00000000         NA
## 25 25 0.00000000         NA
## 26 26 0.00000000         NA
## 27 27 0.00000000         NA
## 28 28 0.00000000         NA
## 29 29 0.00000000         NA
## 30 30 0.00000000         NA
## 31 31 0.00000000         NA
## 32 32 0.00000000         NA
## 33 33 0.00000000         NA
## 34 34 0.00000000         NA
## 35 35 0.00000000         NA
## 36 36 0.00000000         NA
## 37 37 0.00000000         NA
## 38 38 0.00000000         NA
## 39 39 0.02702703         NA
## 40 40 0.01351351         NA
## 41 41 0.02702703         NA
## 42 42 0.04054054         NA
## 43 43 0.04054054         NA
## 44 44 0.02702703         NA
## 45 45 0.02702703         NA
## 46 46 0.06756757         NA
## 47 47 0.08108108         NA
## 48 48 0.10810811         NA
## 49 49 0.09459459         NA
## 50 50 0.10810811         NA
df <- as.data.frame(cbind(x=knn.tune_test$performance$k, y=knn.tune_test$performance$error))
plot_ly(df, x = ~x, y = ~y, type = "scatter", mode = "markers+lines") %>% 
  add_segments(x=1, xend=1, y=0.0, yend=0.45, type = "scatter", 
               line=list(color="gray", width = 2, dash = 'dot'), mode = "lines", showlegend=FALSE) %>% 
  add_segments(x=5, xend=5, y=0.0, yend=0.45, type = "scatter", 
               line=list(color="lightgray", width = 2, dash = 'dot'), mode = "lines", showlegend=FALSE) %>% 
  layout(title='PPMI k-NN Prediction (Test) - Error Rate against k', 
           xaxis=list(title='Number of nearest neighbors (k)'), yaxis=list(title='Classification error'))
## Train and Test data are consistently low at 0 but Test data has an increasing error rate after it reaches K = 39.

### Accuracy is lower for high dimensional data. Low dimensional data has extreme value of very high specificty and very low sensitivity when compared to high dimensional data. Low dimensional data able to classify into 2 clusters whereas high dimensional only allow 1 cluster. Error rate in general are lower in low dimensional data than in high dimensional data and the increasing trend of test data for both dimensions are similar.